@article{https://doi.org/10.1002/mp.17818,
  author   = {Yu, Songli and Li, Yunxiang and Jiao, Pengfei and Liu, Yixiu and Zhao, Jianxiang and Yan, Chenggang and Wang, Qifeng and Wang, Shuai},
  title    = {A {CNN}-transformer-based hybrid {U-shape} model with long-range relay for esophagus {3D} {CT} image gross tumor volume segmentation},
  journal  = {Medical Physics},
  volume   = {52},
  number   = {7},
  pages    = {e17818},
  year     = {2025},
  doi      = {10.1002/mp.17818},
  url      = {https://aapm.onlinelibrary.wiley.com/doi/abs/10.1002/mp.17818},
  keywords = {computed tomography, deep learning, esophageal, gross tumor volume, image segmentation},
  abstract = {Background: Accurate and reliable segmentation of esophageal gross tumor volume (GTV) in computed tomography (CT) is beneficial for diagnosing and treating. However, this remains a challenging task because the esophagus has a variable shape and extensive vertical range, resulting in tumors potentially appearing at any position within it. Purpose: This study introduces a novel CNN-transformer-based U-shape model (LRRM-U-TransNet) designed to enhance the segmentation accuracy of esophageal GTV. By leveraging advanced deep learning techniques, we aim to address the challenges posed by the variable shape and extensive range of the esophagus, ultimately improving diagnostic and treatment outcomes. Methods: Specifically, we propose a long-range relay mechanism to converge all layer feature information by progressively passing adjacent layer feature maps in the pixel and semantic pathways. Moreover, we propose two ready-to-use blocks to implement this mechanism concretely. The Dual FastViT block interacts with feature maps from two paths to enhance feature representation capabilities. The Dual AxialViT block acts as a secondary auxiliary bottleneck to acquire global information for more precise feature map reconstruction. Results: We build a new esophageal tumor dataset with 1665 real-world patient CT samples annotated by five expert radiologists and employ multiple evaluation metrics to validate our model. Results of a five-fold cross-validation on this dataset show that LRRM-U-TransNet achieves a Dice coefficient of 0.834, a Jaccard coefficient of 0.730, a Precision of 0.840, a HD95 of 3.234 mm, and a Volume Similarity of 0.143. Conclusions: We propose a CNN-Transformer hybrid deep learning network to improve the segmentation effect of esophageal tumors. We utilize the local and global information between shallower and deeper layers to prevent early information loss and enhance the cross-layer communication. To validate our model, we collect a dataset composed of 1665 CT images of esophageal tumors from Sichuan Tumor Hospital. The results show that our model outperforms the state-of-the-art models. It is of great significance to improve the accuracy and clinical application of esophageal tumor segmentation.},
}

